2. Dataset quick summary (counts, summary, data types, structure)
# Report the size of the dataset: row count first, then column count
cat("Dataset dimensions:\n")
## Dataset dimensions:
cat(sprintf("Rows: %d\n", nrow(data)))
## Rows: 129971
cat(sprintf("Columns: %d\n", ncol(data)))
## Columns: 14
# Missing values (total & for each column).
# The labels below say "NULL", but what is counted is NA (via is.na()).
# NOTE: empty strings ("") in character columns are NOT counted as missing
# here, so a count of 0 does not mean a text column is fully populated.
cat(paste0("NULL values count: ", sum(is.na(data)), "\n"))
## NULL values count: 8996
cat("NULL values on each column:\n")
## NULL values on each column:
# vapply() (instead of sapply()) guarantees a named integer vector,
# whatever the shape of the input
vapply(data, function(col) sum(is.na(col)), integer(1))
## X country description
## 0 0 0
## designation points price
## 0 0 8996
## province region_1 region_2
## 0 0 0
## taster_name taster_twitter_handle title
## 0 0 0
## variety winery
## 0 0
# Per-column summary (quantiles for numeric columns, length/class for text)
cat("Summary:\n")
## Summary:
summary(data)
## X country description designation
## Min. : 0 Length:129971 Length:129971 Length:129971
## 1st Qu.: 32492 Class :character Class :character Class :character
## Median : 64985 Mode :character Mode :character Mode :character
## Mean : 64985
## 3rd Qu.: 97478
## Max. :129970
##
## points price province region_1
## Min. : 80.00 Min. : 4.00 Length:129971 Length:129971
## 1st Qu.: 86.00 1st Qu.: 17.00 Class :character Class :character
## Median : 88.00 Median : 25.00 Mode :character Mode :character
## Mean : 88.45 Mean : 35.36
## 3rd Qu.: 91.00 3rd Qu.: 42.00
## Max. :100.00 Max. :3300.00
## NA's :8996
## region_2 taster_name taster_twitter_handle title
## Length:129971 Length:129971 Length:129971 Length:129971
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## variety winery
## Length:129971 Length:129971
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# Compact structure overview: one line per column with its type and first values
glimpse(data)
## Rows: 129,971
## Columns: 14
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ country <chr> "Italy", "Portugal", "US", "US", "US", "Spain",…
## $ description <chr> "Aromas include tropical fruit, broom, brimston…
## $ designation <chr> "Vulkà Bianco", "Avidagos", "", "Reserve Late H…
## $ points <int> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,…
## $ price <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, 19, 30,…
## $ province <chr> "Sicily & Sardinia", "Douro", "Oregon", "Michig…
## $ region_1 <chr> "Etna", "", "Willamette Valley", "Lake Michigan…
## $ region_2 <chr> "", "", "Willamette Valley", "", "Willamette Va…
## $ taster_name <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gregutt", …
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwine ", ""…
## $ title <chr> "Nicosia 2013 Vulkà Bianco (Etna)", "Quinta do…
## $ variety <chr> "White Blend", "Portuguese Red", "Pinot Gris", …
## $ winery <chr> "Nicosia", "Quinta dos Avidagos", "Rainstorm", …
# Also extract the data type of each column by applying "class" to it.
# vapply() always yields a named character vector; a column carrying several
# classes is collapsed to "class1/class2", whereas sapply() would silently
# switch to returning a list in that case.
cat("Columns data types:\n")
## Columns data types:
vapply(data, function(col) paste(class(col), collapse = "/"), character(1))
## X country description
## "integer" "character" "character"
## designation points price
## "character" "integer" "numeric"
## province region_1 region_2
## "character" "character" "character"
## taster_name taster_twitter_handle title
## "character" "character" "character"
## variety winery
## "character" "character"
# Explore our two main numerical variables (points & price)
# Wine prices
# Long range of prices, but concentrated on the lower pricing tier.
# The y axis is the histogram density; the dashed vertical line marks the
# mean price. The mean must be taken over `price` (not `points`), and
# `price` contains NAs, so na.rm = TRUE is required to get a finite value.
wines_price_distribution <- data %>%
  ggplot(aes(x = price)) +
  geom_histogram(
    aes(y = ..density..),
    fill = 'red', colour = 'darkred', bins = 100
  ) +
  geom_vline(
    aes(xintercept = mean(price, na.rm = TRUE)),
    color = "darkblue", linetype = "dashed", size = 1
  ) +
  theme_minimal() +
  labs(
    title = 'Wines Price Distribution - Price & Probability + Mean',
    x = 'X = Price',
    y = 'Y = Probability'
  )
# Convert the static ggplot into an interactive plotly widget
ggplotly(wines_price_distribution)
## Warning: Removed 8996 rows containing non-finite values (stat_bin).
# Wine points
# Integers, between 80 and 100 (hence 21 bins, one per possible score).
# The y axis is the histogram density; the dashed vertical line marks the
# mean score. `template` is not a geom_histogram() parameter (ggplot2 ignored
# it with a warning), so it is not passed; theme_minimal() sets the styling.
wines_points_distribution <- data %>%
  ggplot(aes(x = points)) +
  geom_histogram(
    aes(y = ..density..),
    fill = 'lightblue', colour = "darkblue", bins = 21
  ) +
  geom_vline(
    aes(xintercept = mean(points)),
    color = "darkred", linetype = "dashed", size = 1
  ) +
  theme_minimal() +
  labs(
    title = 'Wines Points Distribution - Points & Probability + Mean',
    x = 'X = Points',
    y = 'Y = Probability'
  )
# Convert the static ggplot into an interactive plotly widget
ggplotly(wines_points_distribution)